import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import plotly.express as px
import scipy.stats as _stats
import datetime as dt
import itertools
from matplotlib import pyplot as plt
from IPython.core.display import display, HTML
from helper_functions import *
def jupyter_settings():
    """Configure notebook-wide display defaults for matplotlib, pandas, and seaborn."""
    %matplotlib inline
    %pylab inline
    # Presentation-sized figures and fonts for the 'bmh' style.
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [20, 12]
    plt.rcParams['font.size'] = 24
    # Stretch the notebook container to the full browser width.
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    # Show wide/long frames without truncation; two-decimal float display.
    pd.options.display.max_columns = 200
    pd.options.display.max_rows = 200
    pd.set_option( 'display.expand_frame_repr', False )
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    sns.set()
    #warnings.filterwarnings("ignore")
jupyter_settings()
# Load the King County house-sales dataset; keep the raw frame untouched
# and run all exploratory work on a deep copy.
orig_df = pd.read_csv('data/kc_house_data.csv')
house_df_eda = orig_df.copy(deep=True)
# First-look inspection: sample rows, dtypes/non-null counts, summary stats.
house_df_eda.head()
house_df_eda.info()
house_df_eda.describe().T
It is a small-sized dataset, and it will not suffer from the curse of dimensionality.
# Dataset dimensions.
print(f'Number of Rows:{house_df_eda.shape[0]}')
print(f'Number of Columns {house_df_eda.shape[1]}')
id - it's a unique identifier number for each unique house
date - it's the date when the house was sold
price - it's the selling price when the house was sold
bedrooms - number of bedrooms in the house
bathrooms - number of bathrooms in the house, where a fraction like 0.25 represents a bathroom sink, shower or toilet
sqft_living - square footage of the apartments interior living space
sqft_lot - the size of the land in square feet
floors - number of floors in the house
waterfront - if there is a waterview from the house
view - an index from 0 to 4 of how good the view of the property was or how many views the house has.
condition - the house preservation condition
grade - a rank from 1 to 13, which ranks the construction quality
sqft_above - the size of the house above the ground level in square feet
sqft_basement - the size of the house below the ground level in square feet
yr_built - the year the house was initially built
yr_renovated - the year of the house's last renovation
zipcode - what zipcode area the house is in
lat - Latitude
long - Longitude
sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors (possibly)
sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors
house_df_eda.dtypes
Date reformat type¶
Because it will be handled easily as a date variable type
# Parse the raw date strings into datetime64 so date arithmetic works.
house_df_eda['date'] = pd.to_datetime(house_df_eda['date'])
house_df_eda['date'][0]
# Select only the numeric columns for the summary-statistics table.
num_attributes = house_df_eda.select_dtypes(include=['int64', 'float64'])
num_attributes.columns
# Each statistic is wrapped in a single-column DataFrame so they can be
# concatenated side by side below.
mean = pd.DataFrame(num_attributes.mean())
std = pd.DataFrame(num_attributes.std())
median = pd.DataFrame(num_attributes.median())
# quantile([...]) returns a one-row frame, hence the transpose at concat time.
q1,q3 = pd.DataFrame(num_attributes.quantile([0.25])), pd.DataFrame(num_attributes.quantile([0.75]))
maximum = pd.DataFrame(num_attributes.max())
minimum = pd.DataFrame(num_attributes.min())
# Skew/kurtosis help spot heavily tailed features (e.g. price, sqft_lot).
skew = pd.DataFrame(num_attributes.skew())
kurtosis = pd.DataFrame(num_attributes.kurtosis())
# One row per numeric feature, one column per statistic.
descriptive_statistics = pd.concat([mean,std,minimum,q1.transpose(),median,q3.transpose(),maximum,skew,kurtosis],axis=1)
descriptive_statistics.columns = ['mean','std','minimum','q1','median','q3','maximum','skew','kurtosis']
descriptive_statistics
Missing Values¶
# Missing-value counts per numeric column.
num_attributes.isna().sum()
# Non-numeric (categorical/date) columns: same missing-value check.
cat_attributes = house_df_eda.select_dtypes(exclude=['int64','float64'])
cat_attributes.isna().sum()
Check Duplicates (Houses sold twice or more times)¶
len(house_df_eda['id'].unique())
# Flag every row whose id occurs two or more times (i.e. houses sold more
# than once). duplicated(keep=False) marks all occurrences and runs in O(n);
# the original lambda compared each id against the whole column, O(n^2).
duplicated_ids = house_df_eda['id'].duplicated(keep=False)
Drop the duplicates but keep the most recent, because for this business purpose it is relevant the houses that are currently on market or the real estate portfolio.
house_df_eda.loc[duplicated_ids,['id','date','price', 'yr_renovated','zipcode']].drop_duplicates(subset=['id'], keep='last')
Geographic Data¶
The geographic data attributes latitude and longitude are in their finest granularity, but zipcode despite being in their purest from in terms of value, it could be transformed into other finer variables to describe specifically the location of the house, for example, the street or a more coarse one as county.
Temporal Data¶
The data attributes yr_built and yr_renovated are interval-based and nominal, respectively. And both have an year level of granularity. The date attribute can be considered as interval-based with a day level of temporal granularity
# house_df_eda['yr_built'].sort_values().unique()
# house_df_eda['yr_renovated'].sort_values().unique()
# house_df_eda['date'].sort_values().unique()
Numerical Data¶
All numeric variables {'id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15'}
have the finest granularity, they are not composed or discretized in any way.
# values = num_attributes['sqft_living'].sort_values().values
# tuplex = plt.hist(values, 20, density=True)
# rows, cols = choose_grid(len(num_attributes.columns))
# iterate_through_data(num_attributes,num_attributes.columns,rows,cols)
# identify_best_fit_distribution(num_attributes,'price')
# identify_best_fit_distribution(num_attributes,'sqft_lot')
# multiple_scatter_plots(house_df_eda.loc[:, house_df_eda.columns != 'id'])
Correlation Analysis¶
It would be important to do some correlation analysis with the derived measures in the second cycle.
The sqft_living and sqft_above have an obvious high positive correlation.
Both sqft_living and sqft_above have an high positive correlation with the grade. So for the renovation question, it would be interesting to see if increasing sqft_living the selling price would also increase.
# fig = plt.figure(figsize=[12, 12])
# corr_mtx = num_attributes.corr()
# sns.heatmap(corr_mtx, xticklabels=corr_mtx.columns, yticklabels=corr_mtx.columns, annot=True, cmap='Blues')
# plt.title('Correlation analysis')
# plt.show()
Outlier Analysis¶
Outliers still not analyzed
# Outlier identification functions
def z_score(data, k_dev):
    """Return the values of *data* whose z-score magnitude exceeds ``k_dev``.

    Parameters
    ----------
    data : pd.Series
        Numeric sample to screen for outliers.
    k_dev : float
        Number of standard deviations beyond which a value is flagged.
    """
    # Rounding to 2 decimals mirrors the original implementation so the
    # flagged outliers match previously reported results exactly.
    mean = np.round(data.mean(), decimals=2)
    std_dev = np.round(data.std(), decimals=2)
    # Vectorized z-scores instead of a per-element Python loop.
    z_scores = (data - mean) / std_dev
    return data[np.abs(z_scores) > k_dev]
def modified_zscore(data, thresh):
    """Return values of *data* flagged as outliers by the modified z-score.

    Uses the median and the median absolute deviation (MAD), which are robust
    to the very outliers being searched for; 0.6745 rescales the MAD so the
    score is comparable to a standard z-score under normality.

    Parameters
    ----------
    data : pd.Series
        Numeric sample to screen for outliers.
    thresh : float
        Modified z-score magnitude above which a value is flagged
        (3.5 is the commonly used cut-off).
    """
    median = np.median(data)
    # Vectorized MAD (the original computed it with per-element Python loops).
    median_absolute_deviation = np.median(np.abs(data - median))
    # NOTE(review): if the MAD is 0 this divides by zero and yields inf/nan,
    # matching the original behavior — confirm inputs are non-degenerate.
    modified_z_scores = 0.6745 * (data - median) / median_absolute_deviation
    return data[np.abs(modified_z_scores) > thresh]
def iqr(data, dist):
    """Return the values of *data* outside the Tukey fences Q1/Q3 ± dist*IQR."""
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - dist * spread
    upper_fence = upper_quartile + dist * spread
    outlier_mask = (data < lower_fence) | (data > upper_fence)
    return data[outlier_mask]
# Peek at the first rows without the identifier columns.
house_df_eda.loc[:5, ~house_df_eda.columns.isin(['id','date'])]
# Collect IQR-based outliers for every numeric feature except the identifiers.
feature_columns = num_attributes.loc[:, ~num_attributes.columns.isin(['id','date'])].columns
outliers = {feature: iqr(num_attributes[feature], 1.5) for feature in feature_columns}
outliers['bedrooms'].unique()
Some inconsistent records, or records with inconsistent values, must be removed if they are not relevant outliers, because they will bias the analysis and future model training.
# Work on a copy so the inconsistency probes don't mutate the EDA frame.
house_df_eda_inconsistency = house_df_eda.copy(deep=True)
# Houses listed with zero bedrooms — possibly non-residential properties.
house_df_eda_inconsistency.loc[house_df_eda_inconsistency['bedrooms'] == 0]
# Suspicious record: 33 bedrooms in a small house (likely a data-entry error).
house_df_eda_inconsistency.loc[house_df_eda_inconsistency['bedrooms'] == 33]
# yr_renovated == 0 appears to encode "never renovated" — see assumption below.
house_df_eda_inconsistency.yr_renovated.sort_values().unique()
len(house_df_eda_inconsistency.loc[house_df_eda_inconsistency['yr_renovated'] == 0])
# Consistency check: living area should equal above-ground + basement area,
# so this derived column should never be negative.
house_df_eda_inconsistency['living_area_above_and_basement'] = house_df_eda_inconsistency['sqft_living'] - (house_df_eda_inconsistency['sqft_above'] + house_df_eda_inconsistency['sqft_basement'])
house_df_eda_inconsistency.loc[house_df_eda_inconsistency['living_area_above_and_basement'] < 0]
bedrooms
There are some houses with 0 bedrooms, but may be houses with no purpose of living.
There is one house record with 33 bedrooms and only 1040 square feet of infrastructure size
sqft_living - (sqft_above + sqft_basement)
No inconsistency was found
It is important to do some business data assumption to better analyze the data and take interpretable insights.
Region granularity
There is some difference between sub-regions in King County, so there must be a separation between regions, and have a specific analysis for each. Since a generalized analysis would nullify the correlation of each region, and would be harder to take insights. This region segmentation can be made in different granularities, that is it can be divided by North, South, and East Mountains, but it can also have a finer granularity that is for zipcode regions. Another finer granularity can be the street or street block by collecting that data from the latitude and longitude, and counting the number of houses per region to assess if there is enough data for each street to get significant statistical data.
Bedrooms inconsistency
Houses with 0 bedrooms, may be for other purposes apart from living usage. (Confirm all records consistency, analysing one by one or doing some summary statistics)
Year Renovated inconsistency
It can be assumed that houses with value 0 on yr_renovated is basically houses that have not been renovated yet.
Condition vs Grade relevance
The condition variable seems to carry more weight in the long term than grade when evaluating which houses are better to buy, since grade reflects the infrastructure construction quality, which cannot change much over time, unlike condition, which reflects the house's preservation and can increase with some maintenance work.
First intuition without looking to EDA, and only based on the business understanding. (On the second cycle will be based on the EDA)
Business: Buy and sell houses in Real Estate market, with some profit.
Business problems:
1. Which houses should be bought and for what price?
2. Once it's bought, when is the best time period to sell it and for what price?
3. To rise the housing selling price, the company should do a renovation. So what would be good renewal changes?
These tasks can easily be done as a Classification Task, after doing the labelling task of giving a label to each record, telling if the house should be bought or not. This labelling process comes with some business assumptions of good profit values and empirical rules, and by doing a preliminary analysis we get our first actionable insights that will be presented to business experts. After validating this empirical labelling process, we can train a classification model, to make the decision-making more robust and less biased by the history.
Useful Attributes for business question:
1. date, price, condition, grade, zipcode
2. date, price, condition, grade, zipcode
3. date, price, bedrooms, bathrooms, sqft_living, sqft_above
house_df_fs = house_df_eda[['id','date','price','condition','grade','zipcode','bedrooms','bathrooms','sqft_living','sqft_above','sqft_basement','sqft_lot']]
(Check planning document)
Useful Attributes for business question:
1. date, price, condition, grade, zipcode, plus:
1. median_price - based on zipcode region of the selected houses
2. percentage_value_below_median
3. house_total_m2 - m2_living + m2_lot
4. price/house_total_m2 - will help more obtaining more informative insights on comparisons, since we are normalizing the
price by the house size, and then there is a more fair comparison.
2. From the selected houses to buy create and use:
1. best_season - based on zipcode region of the selected houses, and its selling date
2. selling_price - based on the price and the season
3. profit - will result from difference between selling_price and price
3. From the selected houses to buy create and use:
1.
2.
# Convert the imperial square-footage columns to square metres
# (1 sq ft ≈ 0.0929 m²), then discard the originals.
sqft_cols = ['sqft_living', 'sqft_above', 'sqft_basement', 'sqft_lot']
metric_cols = ['m2_living', 'm2_above', 'm2_basement', 'm2_lot']
house_df_fs[metric_cols] = house_df_fs[sqft_cols] * 0.0929
house_df_fs = house_df_fs.drop(columns=sqft_cols)
house_df_fs.head()
# Total house size = interior living area + land lot area.
house_df_fs['house_total_m2'] = house_df_fs['m2_living'] + house_df_fs['m2_lot']
house_df_fs.head()
1. The houses that have a price value lower than the median and are in good conditions, can be sold for a higher price, so are good to buy.
2. The houses that have a price value lower than the median and are in bad conditions, cannot be sold for a higher price, so are not good to buy.
3. The houses that have a price value higher than the median, independently from the condition, are not good to buy and take profit.
# Price per m² normalizes price by total house size for a fair comparison.
house_df_fs['price_m2'] = house_df_fs['price']/house_df_fs['house_total_m2']
# Median price/m² per zipcode region is the reference for "cheap vs expensive".
zipcode_median = house_df_fs[['price_m2', 'zipcode']].groupby('zipcode').median().reset_index()
zipcode_median.columns = ['zipcode','median_price_m2']
house_df_fs = pd.merge(house_df_fs,zipcode_median,on='zipcode',how='inner')
house_df_fs.head()
house_df_fs['condition'].value_counts()
# Buy rule: priced below the regional median AND in at least average condition.
# Vectorized np.where replaces the row-wise apply that used positional Series
# indexing (x[0]/x[1]/x[2]), which is deprecated in recent pandas and slow.
house_df_fs['status'] = np.where(
    (house_df_fs['price_m2'] < house_df_fs['median_price_m2'])
    & (house_df_fs['condition'] >= 3),
    'buy', 'do not buy')
house_df_fs[['id','zipcode','price_m2','median_price_m2','condition','status']].head()
Order by percentage and then by condition
# percentage_value_below_median
# Binary flag: 1 if the house has ever been renovated
# (yr_renovated == 0 encodes "never renovated").
house_df_eda['renovated'] = house_df_eda['yr_renovated'].apply(lambda x: 1 if x!=0 else 0)
house_df_eda['renovated'].value_counts()
# Coarse two-way region split at latitude 47.46 — presumably the North/South
# divide of King County; TODO confirm the cut-off with a domain expert.
house_df_eda['region'] = house_df_eda['lat'].apply(lambda x: 'North' if x > 47.46 else 'South')
# NOTE(review): sample() has no random_state, so the plotted subset changes
# on every run — consider seeding for reproducibility.
sampleNorth = house_df_eda[house_df_eda['region'] == 'North'].sample(100)
sampleSouth = house_df_eda[house_df_eda['region'] == 'South'].sample(100)
house_df_sampled = pd.concat([sampleNorth,sampleSouth])
# NOTE(review): despite the name, this is price**1.5 (not a cube) — used only
# to exaggerate marker-size differences on the map below.
house_df_sampled['price_cube'] = house_df_sampled['price']**1.5
# Only the columns the map actually needs.
data_map = house_df_sampled[['id','zipcode', 'lat', 'long', 'price','bedrooms','renovated','region','price_cube']]
# Interactive map of the sampled houses: marker size scales with price**1.5,
# color distinguishes North/South. Renamed the figure variable from `map`
# to `price_map` so it no longer shadows the Python builtin.
price_map = px.scatter_mapbox(data_map, lat='lat', lon='long',
                              hover_name='id',
                              hover_data=['price'],
                              size='price_cube',
                              color='region',
                              color_discrete_sequence=['red','green'],
                              size_max=30,
                              zoom=10,
                              height=300)
price_map.update_layout(mapbox_style='open-street-map')
# Second call overrides the initial height and removes the figure margins.
price_map.update_layout(height=600, margin={'r':0, 't':0, 'l':0, 'b':0})
price_map.show()